# -*- coding: utf-8 -*-

# Some pieces of code were taken from: https://datascienceplus.com/evaluation-of-topic-modeling-topic-coherence/
from pprint import pprint
import numpy as np
import os,re
import pyLDAvis.gensim
import warnings
warnings.filterwarnings('ignore')  # To ignore all warnings that arise here to enhance clarity

from gensim.models.coherencemodel import CoherenceModel
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary
from numpy import array

"""# Import the dataset
In this part, the corpus is read into a list (docs). Each file in the corpus is an element in this list.

"""

# Read every corpus file found under `path` into `docs` (one string per file).
docs = []
# Corpus files are named like "ABC_01_2020.txt". The dot is escaped so that it
# only matches a literal "." (unescaped, it would match any character).
pattern = r'^[A-Z]{3}_[0-9]{2}_[0-9]{4}\.txt$'
path = 'C:/Data/'
for dirname, dirs, files in os.walk(path):
    for filename in files:
        # Check the name first so that unrelated files (which may be large,
        # binary or locked) are never opened.
        if not re.match(pattern, filename):
            continue
        thefile = os.path.join(dirname, filename)
        # Undecodable bytes are dropped rather than aborting the whole run.
        with open(thefile, encoding='utf8', errors='ignore') as fin:
            docs.append(fin.read().strip('\n\t'))
print(len(docs))

"""# Define a function for tokenizing and lemmatizing
In this part, some pre-processing is done on the corpus to get rid of tokens that carry little useful information, such as function words like 'the', or digits. The corpus is also lemmatized to avoid sparsity.
"""

from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

def docs_preprocessor(docs, min_token_length=4):
    """
    Tokenize, filter and lemmatize a collection of raw documents

    Each document is lower-cased and split into runs of word characters.
    Tokens consisting only of digits and tokens shorter than
    min_token_length characters are dropped; the remaining tokens are
    lemmatized with the WordNet lemmatizer.

    The input collection is left untouched and a new list is returned.

    Parameters:
    ----------
    docs : Iterable of raw document strings
    min_token_length : Minimum number of characters a token must have to be
        kept (the default of 4 drops every token of 3 characters or fewer)

    Returns:
    -------
    processed : List with one list of tokens per input document, in input order
    """
    tokenizer = RegexpTokenizer(r'\w+')
    lemmatizer = WordNetLemmatizer()

    processed = []
    for doc in docs:
        tokens = tokenizer.tokenize(doc.lower())
        processed.append([
            lemmatizer.lemmatize(token)
            for token in tokens
            # Drop pure numbers (words that merely contain digits are kept)
            # and tokens that are too short to be informative.
            if not token.isdigit() and len(token) >= min_token_length
        ])
    return processed
# Run the pre-processing step over the whole corpus
docs = docs_preprocessor(docs)

"""# Remove rare & common tokens """

# Map every token of the pre-processed documents to an integer id, then prune
# the vocabulary with the thresholds below (no_below=10, no_above=0.2).
dictionary = Dictionary(docs)
dictionary.filter_extremes(no_below=10, no_above=0.2)
# Bag-of-words form of each document; this is the corpus used for topic modeling
corpus = [dictionary.doc2bow(tokens) for tokens in docs]
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

"""# Build The Model"""

# Training settings for the baseline model.
num_topics = 5
chunksize = 500
passes = 20
iterations = 400
# NOTE(review): gensim documents eval_every=1 as estimating log perplexity
# after every update, which slows training noticeably; confirm it is wanted.
eval_every = 1

# Indexing the dictionary once is done only to "load" it, so that the
# id -> token mapping read on the next line is available.
temp = dictionary[0]
id2word = dictionary.id2token

lda_model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=num_topics,
    chunksize=chunksize,
    passes=passes,
    iterations=iterations,
    alpha='auto',
    eta='auto',
    eval_every=eval_every,
)
# Show the keywords of the topics found by the baseline model
print(lda_model.print_topics())

"""# Find the optimal number of topics"""

#Using c_v Measure
def compute_coherence_values(dictionary, corpus, texts, limit, start=10, step=5, **lda_kwargs):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every number of topics in
    range(start, limit, step) and scored with the c_v coherence measure.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Upper bound on the number of topics (exclusive, as in range())
    start : First number of topics to try
    step : Increment between two consecutive numbers of topics
    **lda_kwargs : Optional extra keyword arguments forwarded unchanged to
        LdaModel (for example random_state or passes); with none given the
        models are trained with LdaModel's defaults

    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = LdaModel(corpus=corpus, id2word=dictionary,
                         num_topics=num_topics, **lda_kwargs)
        model_list.append(model)
        coherencemodel = CoherenceModel(model=model, texts=texts,
                                        dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

"""# Create a model list and plot Coherence score against a number of topics"""

# Candidate numbers of topics: start, start + step, ... (limit itself is
# excluded). Defined once so the sweep and the plot axis cannot drift apart.
start = 10
limit = 50
step = 5
model_list, coherence_values = compute_coherence_values(
    dictionary=dictionary, corpus=corpus, texts=docs,
    start=start, limit=limit, step=step)
# Show graph
import matplotlib.pyplot as plt
x = range(start, limit, step)
plt.plot(x, coherence_values)
plt.xlabel("Num Topics")
plt.ylabel("Coherence score")
# legend() takes a sequence of labels; a bare string would be iterated
# character by character and the curve would end up labelled "c".
plt.legend(["coherence_values"], loc='best')
plt.show()

"""The above plot shows that the coherence score increases with the number of topics and declines after 15, so 15 is a decent choice for the number of topics."""

# Report the coherence score obtained for each candidate number of topics
for topic_count, score in zip(x, coherence_values):
    print("Num Topics =", topic_count, " has Coherence Value of", round(score, 4))

"""# Selecting The Optimal Model"""

# Locate the highest coherence value together with its position in the list;
# when several candidates tie, the first one is kept
index_model, max_cv = max(enumerate(coherence_values), key=lambda pair: pair[1])
# The model at that position is the one to keep; show its topics
optimal_model = model_list[index_model]
pprint(optimal_model.print_topics(num_words=10))

"""# Writing the topics along with the 10 most frequent words in a file"""

import re
import csv

# Number of top terms exported per topic (columns term0 .. term9).
num_words = 10
# Written as UTF-8 because pandas.read_csv below decodes UTF-8 by default.
with open('C:/Data/Topics.csv', 'w', encoding='utf8', newline='') as fw:
    writer = csv.writer(fw)
    writer.writerow(['topic_id'] + ['term%d' % k for k in range(num_words)])
    # Ask for every topic explicitly: print_topics()/show_topics() return at
    # most 20 topics by default, which would silently drop rows for larger
    # models. formatted=False yields (topic_id, [(term, weight), ...]) pairs,
    # so the terms do not have to be parsed back out of a formatted string.
    topics = optimal_model.show_topics(num_topics=optimal_model.num_topics,
                                       num_words=num_words, formatted=False)
    for topic_id, terms in topics:
        writer.writerow([topic_id] + [term for term, weight in terms])

"""# Displaying the file"""

from IPython.display import display
import pandas
df = pandas.read_csv("C:/Data/Topics.csv")
display(df)

"""This file is a CSV file. Each line of the file contains a topic id along with the 10 top words for the topic.

# The file is fed to the paper's model and the Unsupervised Labels are:
"""

import csv


def _labels_to_csv(src_path, dst_path, marker):
    """
    Convert the raw label output into a CSV file with three labels per topic

    The raw text is split on marker. Every resulting chunk that has more than
    three lines produces one row: a running topic id (starting at 0) followed
    by the chunk's second, third and fourth lines. Shorter chunks are skipped
    and do not consume a topic id.

    Parameters:
    ----------
    src_path : Path of the raw label file
    dst_path : Path of the CSV file to write
    marker : Text that introduces the labels of each topic in the raw file
    """
    # NOTE(review): the raw file is read with the platform default encoding,
    # as before -- confirm which encoding the labelling tool writes.
    with open(src_path) as fi:
        chunks = fi.read().strip().split(marker)

    # csv.writer quotes labels that contain commas or quotes, so such labels
    # no longer shift columns. UTF-8 matches pandas.read_csv's default decoding.
    with open(dst_path, "w", encoding='utf8', newline='') as fw:
        writer = csv.writer(fw)
        writer.writerow(["Topic_id", "Label_1", "Label_2", "Label_3"])
        topic_id = 0
        for chunk in chunks:
            lines = chunk.split('\n')
            if len(lines) > 3:
                writer.writerow([topic_id] + lines[1:4])
                topic_id += 1


#The Labels are saved into a file named output_unsupervised.txt
#The following code changes the file into CSV format to be shown here
_labels_to_csv("C:/Labels/output_unsupervised.txt",
               "C:/Labels/Labels_unsupervised.txt",
               "top 3 labels for topic")

df = pandas.read_csv("C:/Labels/Labels_unsupervised.txt")
display(df)

"""# The file is fed to the paper's model and the supervised Labels are:"""

#The Labels are saved into a file named output_supervised.txt
#The following code changes the file into CSV format to be shown here
_labels_to_csv("C:/Labels/output_supervised.txt",
               "C:/Labels/Labels_supervised.txt",
               "Top 3 labels for topic")

df = pandas.read_csv("C:/Labels/Labels_supervised.txt")
display(df)

"""The Labels are produced using a neural embedding approach and Wikipedia titles. Each Wikipedia title is represented by two embeddings: doc2vec and word2vec. 

The doc2vec embedding of a title is the embedding of the document the label is associated with, and its word2vec embedding is the result of generating word embeddings for the title. To this end, titles are treated as a single token (e.g. concatenating financial crisis into financial_crisis), then the text of all of the Wikipedia articles is greedily tokenized, and the word embeddings for the tokens are built using the skip-gram algorithm.

In order to generate labels, first, the relevant candidates are generated and then ranked.

 To generate the candidates, given a topic, the cosine similarity between the title embeddings (generated by either doc2vec or word2vec) and each of the word embeddings for the top-10 topic terms is calculated and aggregated by taking the arithmetic mean. The title which yields the highest similarity score is selected as the most relevant label for the topic.
 

The generated labels are stored in a file called output_candidates and ranked in two fashions:

1-Unsupervised: where the only feature used to rank labels is LetterTrigram (Kou et al., 2015)

2-Supervised: the generated candidates are ranked by a support vector regression model (SVR; Joachims, 2006), which is trained over four features using a gold-standard ordering of candidates. The first two features are extracted using two algorithms, LetterTrigram (Kou et al., 2015) and PageRank (Page et al., 1998); the last two features are (1) NumWords, which is simply the number of words in the candidate label (for example, developing country has 2 words), and (2) TopicOverlap, which is the lexical overlap between the candidate label and the top-10 topic terms.

"""

